Goals

  1. Load in phyloseq data with rooted tree.
  2. Evaluate sequencing depth and remove sample.
  3. Normalize the read counts between samples.
  4. Calculate community dissimilarities. Numbers between 0 and 1. If 0, completely similar versus if they are 1, then they’re completely dissimilar.
    1. Sorensen: Shared Species as a binary value: Abundance-unweighted
    2. Bray-Curtis: Shared Abundant species: Abundance-weighted
    3. (Abundance-)Weighted UNIFRAC: Consider Abundant Species and where they fall on the tree
  5. Visualize the community data with two unconstrained Ordinations:
    1. PCoA: Linear Method. Eigenvalue = how much variation is explained by each axis. Choose to view axis 1, 2, 3, etc. and plot them together.
    2. NMDS: Non-linear. Smush multiple Dimensions into 2 or 3 axes. Need to report Stress value (ideally <0.15).
  6. Run statistics with PERMANOVA and betadispR.

Setup

Set the seed

# Any number can be chosen 
set.seed(238428)

Load Libraries

#install.packages("vegan")
pacman::p_load(tidyverse, devtools, phyloseq, patchwork, vegan,
               install = FALSE)

# Load Station colors 
station_colors <- c(
  "Shipping Channel" = "dodgerblue4",
  "Aransas Bay" = "dodgerblue2",
  "Copano West" = "#D9CC3C",
  "Copano East" = "#A0E0BA",
  "Mesquite Bay" = "#00ADA7")

Load Data

# Load in rooted phylogenetic tree! 
load("data/03_Phylogenetic_Tree/phytree_preprocessed_physeq.RData")
midroot_physeq_rm456
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2676 taxa and 89 samples ]
## sample_data() Sample Data:       [ 89 samples by 47 sample variables ]
## tax_table()   Taxonomy Table:    [ 2676 taxa by 9 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2676 tips and 2675 internal nodes ]
unrooted_physeq_rm456
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2676 taxa and 89 samples ]
## sample_data() Sample Data:       [ 89 samples by 47 sample variables ]
## tax_table()   Taxonomy Table:    [ 2676 taxa by 9 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2676 tips and 2674 internal nodes ]

Explore Read Counts

Raw read depth

# Calculate the total number of reads per sample. 
raw_TotalSeqs_df  <- 
  midroot_physeq_rm456 %>%
  # calculate the sample read sums 
  sample_sums() %>%
  data.frame()
# name the column 
colnames(raw_TotalSeqs_df)[1] <- "TotalSeqs"
  
head(raw_TotalSeqs_df)
##                   TotalSeqs
## 20210602-MA-ABB1F      4024
## 20210602-MA-ABB1P      3086
## 20210602-MA-ABB1W      5719
## 20210602-MA-ABS1F      3824
## 20210602-MA-ABS1P      3743
## 20210602-MA-ABS1W      4742
# make histogram of raw reads 
raw_TotalSeqs_df %>%
  ggplot(aes(x = TotalSeqs)) + 
  geom_histogram(bins = 50) + 
  scale_x_continuous(limits = c(0, 10000)) + 
  labs(title = "Raw Sequencing Depth Distribution") + 
  theme_classic()
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

Remove lowly seq sample

raw_rooted_physeq <- 
  midroot_physeq_rm456 %>%
  # remove lowly seq sample that was outlier in alpha diversity analysis
  subset_samples(names != "20210615-MA-ABB2F") %>%
  # any asvs unique to this sample will also be removed 
  prune_taxa(taxa_sums(.) > 0, .)

# Inspect 
raw_rooted_physeq
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2669 taxa and 88 samples ]
## sample_data() Sample Data:       [ 88 samples by 47 sample variables ]
## tax_table()   Taxonomy Table:    [ 2669 taxa by 9 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2669 tips and 2668 internal nodes ]
# what is the minimum number of sequences 
raw_rooted_physeq %>%
  sample_sums() %>%
  min()
## [1] 2194

Normalize read counts

### scale_reads function
#################################################################################### 
# Function to scale reads: http://deneflab.github.io/MicrobeMiseq/ 
# Scales reads by 
# 1) taking proportions
# 2) multiplying by a given library size of n
# 3) rounding 
# Default for n is the minimum sample size in your library
# Default for round is floor

matround <- function(x){trunc(x+0.5)}

scale_reads <- function(physeq, n = min(sample_sums(physeq)), round = "round") {
  
  # transform counts to n
  physeq.scale <- transform_sample_counts(physeq, function(x) {(n * x/sum(x))})
  
  # Pick the rounding functions
  if (round == "floor"){
    otu_table(physeq.scale) <- floor(otu_table(physeq.scale))
  } else if (round == "round"){
    otu_table(physeq.scale) <- round(otu_table(physeq.scale))
  } else if (round == "matround"){
    otu_table(physeq.scale) <- matround(otu_table(physeq.scale))
  }
  
  # Prune taxa and return new phyloseq object
  physeq.scale <- prune_taxa(taxa_sums(physeq.scale) > 0, physeq.scale)
  return(physeq.scale)
}

Scale the reads and check the distribution of the seq depth

This is where one might decide use rarefaction to normalize data.

min(sample_sums(raw_rooted_physeq))
## [1] 2194
# Scale reads by the above function
scaled_rooted_physeq <- 
  raw_rooted_physeq %>%
  scale_reads(round = "matround")

# Calculate the read depth 
scaled_TotalSeqs_df <- 
  scaled_rooted_physeq %>%
  sample_sums() %>%
  data.frame()
colnames(scaled_TotalSeqs_df)[1] <-"TotalSeqs"

# Inspect
head(scaled_TotalSeqs_df)
##                   TotalSeqs
## 20210602-MA-ABB1F      2192
## 20210602-MA-ABB1P      2184
## 20210602-MA-ABB1W      2195
## 20210602-MA-ABS1F      2189
## 20210602-MA-ABS1P      2197
## 20210602-MA-ABS1W      2195
# Check the range of the data 
min_seqs <- min(scaled_TotalSeqs_df$TotalSeqs); min_seqs
## [1] 2180
max_seqs <- max(scaled_TotalSeqs_df$TotalSeqs); max_seqs
## [1] 2225
# range
max_seqs - min_seqs
## [1] 45
# Plot Histogram 
scaled_TotalSeqs_df %>%
  ggplot(aes(x = TotalSeqs)) + 
  geom_histogram(bins = 50) + 
  scale_x_continuous(limits = c(0, 10000)) + 
  labs(title = "Scaled Sequencing Depth at 2194") + 
  theme_classic()
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

Cacluate & Visualize community dissimilarity

Exploratory analyses from Paliy & Shankar (2016) paper, which is using unconstrained ordination methods like PCoA.

PCoA

Sorensen

# Calculate sorensen dissimilarity: Abundance-unweighted of shared taxa
scaled_soren_pcoa <-  
  ordinate(
    physeq = scaled_rooted_physeq,
    method = "PCoA",
    distance = "bray", binary = TRUE)

#str(scaled_soren_pcoa)

# Plot the ordination 
soren_station_pcoa <- plot_ordination(
  physeq = scaled_rooted_physeq,
  ordination = scaled_soren_pcoa,
  color = "station",
  shape = "station",
  title = "Sorensen PCoA") +
  geom_point(size=5, alpha = 0.5, aes(color = station)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = station_colors) + 
  theme_bw()
# Show the plot 
soren_station_pcoa

Note that I have removed the PERMANOVA test to below the ordinations. We will come back to it later!

Here, we are evaluating the shared taxa by presence/absence (abundance-unweighted) in the Sorensen metric.

Note that there is a weird arch or horseshoe-effect in the data. This is likely because we have complete species turnover between our stations. If you have this in your data, please take a look at the paper by Morton et al., 2017. More on this in NMDS…

Note that:

  • Axis 1 = ~21% of variation
  • Axis 2 = ~15% of variation

This means we explain 36% of the variation in the data in these two axes.

Bray

# Calculate the BC distance
scaled_BC_pcoa <- 
  ordinate(
    physeq = scaled_rooted_physeq,
    method = "PCoA",
    distance = "bray")

# Plot the PCoA
bray_station_pcoa <- 
  plot_ordination(
    physeq = scaled_rooted_physeq,
    ordination = scaled_BC_pcoa,
    color = "station",
    shape = "station",
    title = "Bray-Curtis PCoA") +
  geom_point(size=5, alpha = 0.5, aes(color = station)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = c(station_colors)) + 
  theme_bw()
bray_station_pcoa

Here, we are evaluating the shared taxa and then weighting them by their abundances, which provides more influence for species that are more abundant.

Note that there is a weird arch or horseshoe-effect in the data. This is likely because we have complete species turnover between our stations. If you have this in your data, please take a look at the paper by Morton et al., 2017. More on this in NMDS…

Note that:

  • Axis 1 = ~28% of variation
  • Axis 2 = ~20% of variation

This means we explain 48% of the variation in the data in these two axes, which is more than the previous plot with the Sorensen Dissimilarity. Abundance does seem to have an influence!!

It also looks like the samples are now separating more within each group than they did a bit more with Sorensen. Please note this with how the Copano West samples are looking between the Bray-Curtis and the Sorensen plots.

Weighted Unifrac

# Calculate the BC distance
scaled_wUNI_pcoa <- 
  ordinate(
    physeq = scaled_rooted_physeq,
    method = "PCoA",
    distance = "wunifrac")

# Plot the PCoA
wUNI_station_pcoa <- 
  plot_ordination(
    physeq = scaled_rooted_physeq,
    ordination = scaled_wUNI_pcoa,
    color = "station",
    shape = "station",
    title = "Weighted Unifrac PCoA") +
  geom_point(size=5, alpha = 0.5, aes(color = station)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = c(station_colors)) + 
  theme_bw()
wUNI_station_pcoa

Here, we are evaluating the shared taxa and then weighting them by their abundances, which provides more influence for species that are more abundant.

Note that there is a weird arch or horseshoe-effect in the data. This is likely because we have complete species turnover between our stations. If you have this in your data, please take a look at the paper by Morton et al., 2017. More on this in NMDS…

Note that:

  • Axis 1 = ~41% of variation
  • Axis 2 = ~29% of variation

This means we explain 70% of the variation in the data in these two axes (!!!), which is significantly more than the previous plots with the taxonomic dissimilarity measures. Here, phylogeny seems to be very important! This means that taxa that are abundant are found in different places in the phylogenetic tree. Therefore, the evoultionary distances (aka the branch lengths) and their abundances seem to have a major influence!!

It also looks like the samples are now separating more within each group than they did a bit more with Sorensen. Please note this with how the Copano West samples are looking between the Bray-Curtis and the Sorensen plots.

Combine PCoAs

Let’s plot all three together into one plot to have a concise visualization of the three metrics.

(soren_station_pcoa + theme(legend.position = "none")) + 
  (bray_station_pcoa + theme(legend.position = "none")) + 
    (wUNI_station_pcoa + theme(legend.position = "none"))

NMDS

Weighted Unifrac

Since we did 3 of the dissimilarity metrics for the PCoA, let’s just plot one example of them for the NMDS plotting. Here, we will use weighted Unifrac

# Calculate the Weighted Unifrac distance
scaled_wUNI_nmds <- 
  ordinate(
    physeq = scaled_rooted_physeq,
    method = "NMDS",
    distance = "wunifrac")
## Run 0 stress 0.08850802 
## Run 1 stress 0.1820551 
## Run 2 stress 0.08850781 
## ... New best solution
## ... Procrustes: rmse 0.0002277753  max resid 0.001768078 
## ... Similar to previous best
## Run 3 stress 0.2077088 
## Run 4 stress 0.08850869 
## ... Procrustes: rmse 0.001242072  max resid 0.008811312 
## ... Similar to previous best
## Run 5 stress 0.08850773 
## ... New best solution
## ... Procrustes: rmse 0.0001066012  max resid 0.0008692466 
## ... Similar to previous best
## Run 6 stress 0.187642 
## Run 7 stress 0.2135437 
## Run 8 stress 0.08850789 
## ... Procrustes: rmse 8.698477e-05  max resid 0.0006694909 
## ... Similar to previous best
## Run 9 stress 0.08850788 
## ... Procrustes: rmse 0.0001231422  max resid 0.00100307 
## ... Similar to previous best
## Run 10 stress 0.08850795 
## ... Procrustes: rmse 0.000102928  max resid 0.0008064815 
## ... Similar to previous best
## Run 11 stress 0.0885085 
## ... Procrustes: rmse 0.001312513  max resid 0.008988158 
## ... Similar to previous best
## Run 12 stress 0.1992754 
## Run 13 stress 0.1682702 
## Run 14 stress 0.08850869 
## ... Procrustes: rmse 0.001282848  max resid 0.00895487 
## ... Similar to previous best
## Run 15 stress 0.08850805 
## ... Procrustes: rmse 0.0001822032  max resid 0.001499852 
## ... Similar to previous best
## Run 16 stress 0.08850798 
## ... Procrustes: rmse 0.0001135539  max resid 0.000870369 
## ... Similar to previous best
## Run 17 stress 0.1923846 
## Run 18 stress 0.08850795 
## ... Procrustes: rmse 0.0001606615  max resid 0.001241155 
## ... Similar to previous best
## Run 19 stress 0.08850874 
## ... Procrustes: rmse 0.001272308  max resid 0.008930962 
## ... Similar to previous best
## Run 20 stress 0.1471595 
## *** Best solution repeated 10 times
# Plot the PCoA
wUNI_station_nmds <- 
  plot_ordination(
    physeq = scaled_rooted_physeq,
    ordination = scaled_wUNI_nmds,
    color = "station",
    shape = "station",
    title = "Weighted Unifrac NMDS") +
  geom_point(size=5, alpha = 0.5, aes(color = station)) +
  scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
  scale_color_manual(values = c(station_colors)) + 
  theme_bw()
wUNI_station_nmds

We can see from above the plot that the stress value is ~0.15, which is just barely at the limit of the acceptable stress value. And, It seems important to emphasize that the PCoA and the NMDS plot both look pretty similar!

In this case, I would always prefer to report the PCoA results because they are linear and provide a lot more post-hoc analyses to follow up with. In addition, it’s helpful to only have 2 axes of variation and show how much variation is explained.

(wUNI_station_pcoa + theme(legend.position = "none")) + 
  (wUNI_station_nmds + theme(legend.position = "none"))

Statistical Significance Testing

PERMANOVA

# Calculate all three of the distance matrices
scaled_sorensen_dist <- phyloseq::distance(scaled_rooted_physeq, method = "bray", binary = TRUE)
scaled_bray_dist <- phyloseq::distance(scaled_rooted_physeq, method = "bray")
scaled_wUnifrac_dist <- phyloseq::distance(scaled_rooted_physeq, method = "wunifrac")

# make a data frame from the sample_data
# All distance matrices will be the same metadata because they 
# originate from the same phyloseq object. 
metadata <- data.frame(sample_data(scaled_rooted_physeq))

# Adonis test
# In this example we are testing the hypothesis that the five stations
# that were collected have different centroids in the ordination space 
# for each of the dissimilarity metrics, we are using a discrete variable 
adonis2(scaled_sorensen_dist ~ station, data = metadata)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_sorensen_dist ~ station, data = metadata)
##          Df SumOfSqs      R2      F Pr(>F)    
## station   4   6.3716 0.29265 8.5848  0.001 ***
## Residual 83  15.4007 0.70735                  
## Total    87  21.7723 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_bray_dist ~ station, data = metadata)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_bray_dist ~ station, data = metadata)
##          Df SumOfSqs      R2      F Pr(>F)    
## station   4   7.3069 0.38689 13.094  0.001 ***
## Residual 83  11.5794 0.61311                  
## Total    87  18.8863 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_wUnifrac_dist ~ station, data = metadata)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_wUnifrac_dist ~ station, data = metadata)
##          Df SumOfSqs      R2      F Pr(>F)    
## station   4   1.1233 0.45288 17.176  0.001 ***
## Residual 83   1.3571 0.54712                  
## Total    87   2.4804 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Note that:

  • R2 = the percent variation explained.
  • F = the F-Statistic, which represents the importance value.
  • Pr(>F) = the pvalue

Above, we see that the most variation is explained by the weighted unifrac, which explains ~45% of the variation in the data and also has the highest F-statistic.

# We might also care about other variables
# Here, we will add date and fraction as variables
# multiplicative model ORDER MATTERS! 
adonis2(scaled_sorensen_dist ~ station * date * fraction, data = metadata)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_sorensen_dist ~ station * date * fraction, data = metadata)
##                       Df SumOfSqs      R2       F Pr(>F)    
## station                4   6.3716 0.29265 12.7392  0.001 ***
## date                   2   2.3935 0.10993  9.5710  0.001 ***
## fraction               2   0.5172 0.02376  2.0683  0.002 ** 
## station:date           8   3.9412 0.18102  3.9399  0.001 ***
## station:fraction       8   0.9169 0.04212  0.9167  0.693    
## date:fraction          4   0.5037 0.02313  1.0070  0.443    
## station:date:fraction 16   1.7514 0.08044  0.8754  0.894    
## Residual              43   5.3767 0.24695                   
## Total                 87  21.7723 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_bray_dist ~ station * date * fraction, data = metadata)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_bray_dist ~ station * date * fraction, data = metadata)
##                       Df SumOfSqs      R2       F Pr(>F)    
## station                4   7.3069 0.38689 23.2209  0.001 ***
## date                   2   2.0117 0.10651 12.7859  0.001 ***
## fraction               2   0.7505 0.03974  4.7698  0.001 ***
## station:date           8   3.4783 0.18417  5.5270  0.001 ***
## station:fraction       8   0.7016 0.03715  1.1149  0.280    
## date:fraction          4   0.3056 0.01618  0.9713  0.502    
## station:date:fraction 16   0.9490 0.05025  0.7540  0.965    
## Residual              43   3.3827 0.17911                   
## Total                 87  18.8863 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Note that the ORDER MATTERS!
adonis2(scaled_wUnifrac_dist ~ station * date * fraction, data = metadata)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_wUnifrac_dist ~ station * date * fraction, data = metadata)
##                       Df SumOfSqs      R2       F Pr(>F)    
## station                4  1.12332 0.45288 27.9816  0.001 ***
## date                   2  0.17318 0.06982  8.6277  0.001 ***
## fraction               2  0.22887 0.09227 11.4023  0.001 ***
## station:date           8  0.32313 0.13027  4.0245  0.001 ***
## station:fraction       8  0.07703 0.03106  0.9594  0.530    
## date:fraction          4  0.02648 0.01068  0.6596  0.824    
## station:date:fraction 16  0.09684 0.03904  0.6031  0.995    
## Residual              43  0.43156 0.17399                   
## Total                 87  2.48042 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(scaled_wUnifrac_dist ~ date * station * fraction, data = metadata)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = scaled_wUnifrac_dist ~ date * station * fraction, data = metadata)
##                       Df SumOfSqs      R2       F Pr(>F)    
## date                   2  0.17259 0.06958  8.5984  0.001 ***
## station                4  1.12391 0.45311 27.9963  0.001 ***
## fraction               2  0.22887 0.09227 11.4023  0.001 ***
## date:station           8  0.32313 0.13027  4.0245  0.001 ***
## date:fraction          4  0.02633 0.01062  0.6559  0.852    
## station:fraction       8  0.07718 0.03111  0.9612  0.549    
## date:station:fraction 16  0.09684 0.03904  0.6031  0.989    
## Residual              43  0.43156 0.17399                   
## Total                 87  2.48042 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can also run tests that include additive (+) or multipliciatve models, which include the interaction term between variables.

BetaDispR

The PERMANOVA is sensitive to variance/dispersion in the data. Therefore, we need to run a homogeneity of dispersion test to test for the sensitivity of our PERMANOVA results to variance.

# Homogeneity of Disperson test with beta dispr
# Sorensen 
beta_soren_station <- betadisper(scaled_sorensen_dist, metadata$station)
permutest(beta_soren_station)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)    
## Groups     4 0.18429 0.046073 6.3863    999  0.001 ***
## Residuals 83 0.59878 0.007214                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Bray-curtis 
beta_bray_station <- betadisper(scaled_bray_dist, metadata$station)
permutest(beta_bray_station)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)    
## Groups     4 0.37117 0.092794 7.0268    999  0.001 ***
## Residuals 83 1.09607 0.013206                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Weighted Unifrac 
beta_bray_station <- betadisper(scaled_wUnifrac_dist, metadata$station)
permutest(beta_bray_station)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
## 
## Response: Distances
##           Df   Sum Sq  Mean Sq     F N.Perm Pr(>F)   
## Groups     4 0.042101 0.010525 4.195    999  0.005 **
## Residuals 83 0.208244 0.002509                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Above, our variance is impacted by station. Therefore, we need to be very careful about what we conclude about our data.

Taxonomic Composition

Phylum

# Set the phylum colors
phylum_colors <- c(
  Acidobacteriota = "navy", 
  Actinobacteriota = "darkslategray2", 
  Armatimonadota = "deeppink1",
  Alphaproteobacteria = "plum2", 
  Bacteroidota = "gold", 
  Betaproteobacteria = "plum1", 
  Bdellovibrionota = "red1",
  Chloroflexi="black", 
  Crenarchaeota = "firebrick",
  Cyanobacteria = "limegreen",
  Deltaproteobacteria = "grey", 
  Desulfobacterota="magenta",
  Firmicutes = "#3E9B96",
  Gammaproteobacteria = "greenyellow",
  "Marinimicrobia (SAR406 clade)" = "yellow",
  Myxococcota = "#B5D6AA",
  Nitrospirota = "palevioletred1",
  Proteobacteria = "royalblue",
  Planctomycetota = "darkorange", 
  "SAR324 clade(Marine group B)" = "olivedrab",
  #Proteobacteria_unclassified = "greenyellow",
  Thermoplasmatota = "green",
  Verrucomicrobiota = "darkorchid1")
 # Other = "grey")
# Calculate the phylum relative abundance 
# Note: The read depth MUST be normalized in some way: scale_reads
phylum_df <- 
  scaled_rooted_physeq %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Phylum") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01) %>%
  # fix the order of date
  mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
         station = fct_relevel(station, c("Copano West", "Copano East",
                                          "Mesquite Bay", "Aransas Bay",
                                          "Shipping Channel")))

# Stacked Bar Plot With All phyla 
# Plot Phylum Abundances - make sure to load phylum_colors 
phylum_df %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(depth == 0.0) %>%
  dplyr::filter(fraction == "Whole") %>%
  ggplot(aes(x = station, y = Abundance, fill = Phylum)) + 
  facet_grid(.~date) + 
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Surface Phylum Composition") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

## Make each phyla it's own row 
phylum_df %>%
  # Warning: It's important to have one sample per x value, 
  # otherwise, it will take the sum between multiple samples
  dplyr::filter(depth == 0.0) %>%
  dplyr::filter(fraction == "Whole") %>%
  ggplot(aes(x = station, y = Abundance, fill = Phylum)) + 
  facet_grid(Phylum~date, scale = "free") + 
  geom_bar(stat = "identity", color = "black") + 
  labs(title = "Surface Phylum Composition") + 
  scale_fill_manual(values = phylum_colors) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        strip.text.y = element_text(size = 5))

# Narrow in on a specific group
# Actinobacteriota - y: abundance, x: station, dot plot + boxplot
phylum_df %>%
  dplyr::filter(Phylum == "Actinobacteriota") %>%
  # build the plot 
  ggplot(aes(x = station, y = Abundance, 
             fill = station, color = station)) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Actinobacteriota Phylum Abundance") + 
  scale_color_manual(values = station_colors) + 
  scale_fill_manual(values = station_colors) + 
    theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
          legend.position = "right")

# Statistically: Kruskall-Wallis followed by a Tukey's Posthoc test
# These are non-parametric (non-normal) stat tests 

Family

# Calculate the Family relative abundance 
# Note: The read depth MUST be normalized in some way: scale_reads
family_df <- 
  scaled_rooted_physeq %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Family") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01) %>%
  # fix the order of date
  mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
         station = fct_relevel(station, c("Copano West", "Copano East",
                                          "Mesquite Bay", "Aransas Bay",
                                          "Shipping Channel")))
# Check family_df
#str(family_df)

# Plot family 
# Actinobacteriota Families 
family_df %>%
  dplyr::filter(Phylum == "Actinobacteriota") %>%
  # build the plot 
  ggplot(aes(x = station, y = Abundance, 
             fill = station, color = station)) + 
  facet_wrap(.~Family, scales = "free_y", nrow = 1) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Actinobacteriota Family Abundance") + 
  scale_color_manual(values = station_colors) + 
  scale_fill_manual(values = station_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")

# Plot family 
family_df %>%
  dplyr::filter(Phylum == "Cyanobacteria") %>%
  # build the plot 
  ggplot(aes(x = station, y = Abundance, 
             fill = station, color = station)) + 
  facet_wrap(.~Family, scales = "free_y", nrow = 1) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Cyanobacteria Family Abundance") + 
  scale_color_manual(values = station_colors) + 
  scale_fill_manual(values = station_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")

Genus

# Calculate the Family relative abundance 
# Note: The read depth MUST be normalized in some way: scale_reads
genus_df <- 
  scaled_rooted_physeq %>%
  # agglomerate at the phylum level 
  tax_glom(taxrank = "Genus") %>% 
  # Transform counts to relative abundance 
  transform_sample_counts(function (x) {x/sum(x)}) %>%
  # Melt to a long format 
  psmelt() %>%
  # Filter out Phyla that are <1% - get rid of low abundant Phyla
  dplyr::filter(Abundance > 0.01) %>%
  # fix the order of date
  mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
         station = fct_relevel(station, c("Copano West", "Copano East",
                                          "Mesquite Bay", "Aransas Bay",
                                          "Shipping Channel")))


# Actinobacteriota
# Plot genus 
genus_df %>%
  dplyr::filter(Phylum == "Actinobacteriota") %>%
  # build the plot 
  ggplot(aes(x = station, y = Abundance, 
             fill = station, color = station)) + 
  facet_wrap(.~Genus, scales = "free_y", nrow = 1) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Actinobacteriota Genus Abundance") + 
  scale_color_manual(values = station_colors) + 
  scale_fill_manual(values = station_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")

# Cyanobacteria 
# Plot genus 
genus_df %>%
  dplyr::filter(Phylum == "Cyanobacteria") %>%
  # build the plot 
  ggplot(aes(x = station, y = Abundance, 
             fill = station, color = station)) + 
  facet_wrap(.~Genus, scales = "free_y", nrow = 1) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot 
  geom_jitter() + 
  theme_bw() + 
  labs(title = "Cyanobacteria Genus Abundance") + 
  scale_color_manual(values = station_colors) + 
  scale_fill_manual(values = station_colors) + 
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
        legend.position = "bottom")

Session Information

For reproducibility

devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.3.2 (2023-10-31)
##  os       macOS Sonoma 14.4.1
##  system   aarch64, darwin20
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/New_York
##  date     2024-05-08
##  pandoc   3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package          * version   date (UTC) lib source
##  ade4               1.7-22    2023-02-06 [1] CRAN (R 4.3.0)
##  ape                5.7-1     2023-03-13 [1] CRAN (R 4.3.0)
##  Biobase            2.62.0    2023-10-26 [1] Bioconductor
##  BiocGenerics       0.48.1    2023-11-02 [1] Bioconductor
##  biomformat         1.30.0    2023-10-26 [1] Bioconductor
##  Biostrings         2.70.1    2023-10-26 [1] Bioconductor
##  bitops             1.0-7     2021-04-24 [1] CRAN (R 4.3.0)
##  bslib              0.6.1     2023-11-28 [1] CRAN (R 4.3.1)
##  cachem             1.0.8     2023-05-01 [2] CRAN (R 4.3.0)
##  cli                3.6.2     2023-12-11 [1] CRAN (R 4.3.1)
##  cluster            2.1.6     2023-12-01 [1] CRAN (R 4.3.1)
##  codetools          0.2-19    2023-02-01 [2] CRAN (R 4.3.2)
##  colorspace         2.1-0     2023-01-23 [2] CRAN (R 4.3.0)
##  crayon             1.5.2     2022-09-29 [2] CRAN (R 4.3.0)
##  data.table         1.14.10   2023-12-08 [1] CRAN (R 4.3.1)
##  devtools         * 2.4.5     2022-10-11 [1] CRAN (R 4.3.0)
##  digest             0.6.33    2023-07-07 [2] CRAN (R 4.3.0)
##  dplyr            * 1.1.4     2023-11-17 [1] CRAN (R 4.3.1)
##  ellipsis           0.3.2     2021-04-29 [2] CRAN (R 4.3.0)
##  evaluate           0.23      2023-11-01 [1] CRAN (R 4.3.1)
##  fansi              1.0.6     2023-12-08 [1] CRAN (R 4.3.1)
##  farver             2.1.1     2022-07-06 [2] CRAN (R 4.3.0)
##  fastmap            1.1.1     2023-02-24 [2] CRAN (R 4.3.0)
##  forcats          * 1.0.0     2023-01-29 [2] CRAN (R 4.3.0)
##  foreach            1.5.2     2022-02-02 [1] CRAN (R 4.3.0)
##  fs                 1.6.3     2023-07-20 [2] CRAN (R 4.3.0)
##  generics           0.1.3     2022-07-05 [2] CRAN (R 4.3.0)
##  GenomeInfoDb       1.38.5    2023-12-30 [1] Bioconductor 3.18 (R 4.3.2)
##  GenomeInfoDbData   1.2.11    2024-01-10 [1] Bioconductor
##  ggplot2          * 3.4.4     2023-10-12 [1] CRAN (R 4.3.1)
##  glue               1.7.0     2024-01-09 [1] CRAN (R 4.3.1)
##  gtable             0.3.4     2023-08-21 [2] CRAN (R 4.3.0)
##  highr              0.10      2022-12-22 [2] CRAN (R 4.3.0)
##  hms                1.1.3     2023-03-21 [2] CRAN (R 4.3.0)
##  htmltools          0.5.7     2023-11-03 [1] CRAN (R 4.3.1)
##  htmlwidgets        1.6.4     2023-12-06 [1] CRAN (R 4.3.1)
##  httpuv             1.6.13    2023-12-06 [1] CRAN (R 4.3.1)
##  igraph             1.6.0     2023-12-11 [1] CRAN (R 4.3.1)
##  IRanges            2.36.0    2023-10-26 [1] Bioconductor
##  iterators          1.0.14    2022-02-05 [1] CRAN (R 4.3.0)
##  jquerylib          0.1.4     2021-04-26 [2] CRAN (R 4.3.0)
##  jsonlite           1.8.8     2023-12-04 [1] CRAN (R 4.3.1)
##  knitr              1.45      2023-10-30 [1] CRAN (R 4.3.1)
##  labeling           0.4.3     2023-08-29 [2] CRAN (R 4.3.0)
##  later              1.3.2     2023-12-06 [1] CRAN (R 4.3.1)
##  lattice          * 0.22-5    2023-10-24 [1] CRAN (R 4.3.1)
##  lifecycle          1.0.4     2023-11-07 [1] CRAN (R 4.3.1)
##  lubridate        * 1.9.3     2023-09-27 [1] CRAN (R 4.3.1)
##  magrittr           2.0.3     2022-03-30 [2] CRAN (R 4.3.0)
##  MASS               7.3-60    2023-05-04 [2] CRAN (R 4.3.2)
##  Matrix             1.6-4     2023-11-30 [1] CRAN (R 4.3.1)
##  memoise            2.0.1     2021-11-26 [2] CRAN (R 4.3.0)
##  mgcv               1.9-1     2023-12-21 [1] CRAN (R 4.3.1)
##  mime               0.12      2021-09-28 [2] CRAN (R 4.3.0)
##  miniUI             0.1.1.1   2018-05-18 [1] CRAN (R 4.3.0)
##  multtest           2.58.0    2023-10-26 [1] Bioconductor
##  munsell            0.5.0     2018-06-12 [2] CRAN (R 4.3.0)
##  nlme               3.1-164   2023-11-27 [1] CRAN (R 4.3.1)
##  pacman             0.5.1     2019-03-11 [2] CRAN (R 4.3.0)
##  patchwork        * 1.2.0     2024-01-08 [1] CRAN (R 4.3.1)
##  permute          * 0.9-7     2022-01-27 [1] CRAN (R 4.3.0)
##  phyloseq         * 1.46.0    2023-10-24 [1] Bioconductor
##  pillar             1.9.0     2023-03-22 [2] CRAN (R 4.3.0)
##  pkgbuild           1.4.3     2023-12-10 [1] CRAN (R 4.3.1)
##  pkgconfig          2.0.3     2019-09-22 [2] CRAN (R 4.3.0)
##  pkgload            1.3.3     2023-09-22 [1] CRAN (R 4.3.1)
##  plyr               1.8.9     2023-10-02 [1] CRAN (R 4.3.1)
##  profvis            0.3.8     2023-05-02 [1] CRAN (R 4.3.0)
##  promises           1.2.1     2023-08-10 [1] CRAN (R 4.3.0)
##  purrr            * 1.0.2     2023-08-10 [2] CRAN (R 4.3.0)
##  R6                 2.5.1     2021-08-19 [2] CRAN (R 4.3.0)
##  Rcpp               1.0.12    2024-01-09 [1] CRAN (R 4.3.1)
##  RCurl              1.98-1.14 2024-01-09 [1] CRAN (R 4.3.1)
##  readr            * 2.1.4     2023-02-10 [2] CRAN (R 4.3.0)
##  remotes            2.4.2.1   2023-07-18 [2] CRAN (R 4.3.0)
##  reshape2           1.4.4     2020-04-09 [2] CRAN (R 4.3.0)
##  rhdf5              2.46.1    2023-12-02 [1] Bioconductor 3.18 (R 4.3.2)
##  rhdf5filters       1.14.1    2023-12-16 [1] Bioconductor 3.18 (R 4.3.2)
##  Rhdf5lib           1.24.1    2023-12-12 [1] Bioconductor 3.18 (R 4.3.2)
##  rlang              1.1.2     2023-11-04 [1] CRAN (R 4.3.1)
##  rmarkdown          2.25      2023-09-18 [1] CRAN (R 4.3.1)
##  rstudioapi         0.15.0    2023-07-07 [2] CRAN (R 4.3.0)
##  S4Vectors          0.40.2    2023-11-25 [1] Bioconductor 3.18 (R 4.3.2)
##  sass               0.4.8     2023-12-06 [1] CRAN (R 4.3.1)
##  scales             1.3.0     2023-11-28 [1] CRAN (R 4.3.1)
##  sessioninfo        1.2.2     2021-12-06 [1] CRAN (R 4.3.0)
##  shiny              1.8.0     2023-11-17 [1] CRAN (R 4.3.1)
##  stringi            1.8.3     2023-12-11 [1] CRAN (R 4.3.1)
##  stringr          * 1.5.1     2023-11-14 [1] CRAN (R 4.3.1)
##  survival           3.5-7     2023-08-14 [2] CRAN (R 4.3.2)
##  tibble           * 3.2.1     2023-03-20 [2] CRAN (R 4.3.0)
##  tidyr            * 1.3.0     2023-01-24 [2] CRAN (R 4.3.0)
##  tidyselect         1.2.0     2022-10-10 [2] CRAN (R 4.3.0)
##  tidyverse        * 2.0.0     2023-02-22 [2] CRAN (R 4.3.0)
##  timechange         0.2.0     2023-01-11 [2] CRAN (R 4.3.0)
##  tzdb               0.4.0     2023-05-12 [2] CRAN (R 4.3.0)
##  urlchecker         1.0.1     2021-11-30 [1] CRAN (R 4.3.0)
##  usethis          * 2.2.2     2023-07-06 [1] CRAN (R 4.3.0)
##  utf8               1.2.4     2023-10-22 [1] CRAN (R 4.3.1)
##  vctrs              0.6.5     2023-12-01 [1] CRAN (R 4.3.1)
##  vegan            * 2.6-4     2022-10-11 [1] CRAN (R 4.3.0)
##  withr              2.5.2     2023-10-30 [1] CRAN (R 4.3.1)
##  xfun               0.41      2023-11-01 [1] CRAN (R 4.3.1)
##  xtable             1.8-4     2019-04-21 [1] CRAN (R 4.3.0)
##  XVector            0.42.0    2023-10-26 [1] Bioconductor
##  yaml               2.3.8     2023-12-11 [1] CRAN (R 4.3.1)
##  zlibbioc           1.48.0    2023-10-26 [1] Bioconductor
## 
##  [1] /Users/mls528/Library/R/arm64/4.3/library
##  [2] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
## 
## ──────────────────────────────────────────────────────────────────────────────